import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
%matplotlib inline
from collections import Counter
import scipy.stats as sts
from datetime import date
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
C:\Users\lavre\anaconda3\Lib\site-packages\paramiko\transport.py:219: CryptographyDeprecationWarning: Blowfish has been deprecated "class": algorithms.Blowfish,
Загрузим датасет прослушиваний музыки с kaggle (ссылка - https://www.kaggle.com/competitions/kkbox-music-recommendation-challenge/data) и проведем exploratory data analysis для каждой из таблиц.
songs_df = pd.read_csv('data/songs.csv')
song_extra_info_df = pd.read_csv('data/song_extra_info.csv')
members_df = pd.read_csv('data/members.csv')
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sample_submission_df = pd.read_csv('data/sample_submission.csv')
Таблица содержит информацию о музыкальном треке и имеет следующие поля:
songs_df.sample(10)
| song_id | song_length | genre_ids | artist_name | composer | lyricist | language | |
|---|---|---|---|---|---|---|---|
| 1753532 | 5adIWoR3U8LISjrFKxTE3QzhtrIcf5bblf6JTz8uiW4= | 266240 | 465 | Elka | ELKA | wit☆ | 17.0 |
| 1024319 | Mg5gczbqI9pSGzIVfSJ5TN4vNusNW2gANCY/Cb/t0kQ= | 191147 | 139|125|109 | The Blenders | NaN | NaN | 52.0 |
| 1591547 | w1//42G5PhFARMqGR8VW3KRjf3AwPsbMztuXAadxX4k= | 222540 | 1259 | MC Yogi | NaN | NaN | 52.0 |
| 2242455 | p9hhn7rY2R37BUkHqReK7GlVoU3rPIcuCiXTz0RdUeM= | 344119 | 2122 | KEI SASAKI (佐々木 慧) | Vernon Duke | NaN | 17.0 |
| 495174 | j1I3fvvuE+patqq/izI5oC5bYAcKwnPbnrLF3EqXLrg= | 229134 | 359 | Dia Frampton | Dia Frampton| Tim Anderson| Daniel Heath | NaN | 52.0 |
| 2085618 | o/ZV5YCnS8gOr1hmjxJ8L8y5Zx9J7sex19w+p99QpAc= | 323686 | 940 | Pianissimo Brothers | NaN | NaN | -1.0 |
| 1875146 | MhAjo8k858FG+1Yr1+LkmyvVSwEASQuI0hgzE1DBrwI= | 114149 | 1981 | The Highwaymen | Rossi | NaN | 52.0 |
| 1232832 | +90ME5mnrp6s23j1yw4wTTSFMqbipxAdhTkuEgvBpeo= | 190682 | 465 | Fatman Scoop | NaN | NaN | 52.0 |
| 2234228 | cESbN7rydnt3aIf6NGnfUHKEAtVWnGNWRB9dFmv1phc= | 273449 | NaN | Statue | NaN | NaN | 52.0 |
| 1238373 | 7HpJvyIEsOrfHSi6WVjkS5Hi+/sGPhvM5xX6Cdz3Lx8= | 260063 | 2022 | Backtrack Professional Karaoke Band | NaN | NaN | 52.0 |
songs_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2296320 entries, 0 to 2296319 Data columns (total 7 columns): # Column Dtype --- ------ ----- 0 song_id object 1 song_length int64 2 genre_ids object 3 artist_name object 4 composer object 5 lyricist object 6 language float64 dtypes: float64(1), int64(1), object(5) memory usage: 122.6+ MB
print(f'Количество дупликатов в таблице songs_df: {songs_df.duplicated().sum()}')
Количество дупликатов в таблице songs_df: 0
Построим датафрэйм, в котором отразим количество пропусков и уникальных значений, как в абсолютном, так и в процентном соотношении.
songs_nan_df = pd.DataFrame(dict(songs_df.isna().sum()).items(), columns=['column', 'misses_cnt'])
songs_nan_df['misses_percent'] = round((songs_nan_df['misses_cnt'] / songs_df.shape[0]) * 100, 2)
songs_nan_df['unique_cnt'] = dict(songs_df.nunique()).values()
songs_nan_df['unique_percent'] = round((songs_nan_df['unique_cnt'] / songs_df.shape[0]) * 100, 2)
songs_nan_df
| column | misses_cnt | misses_percent | unique_cnt | unique_percent | |
|---|---|---|---|---|---|
| 0 | song_id | 0 | 0.00 | 2296320 | 100.00 |
| 1 | song_length | 0 | 0.00 | 146534 | 6.38 |
| 2 | genre_ids | 94116 | 4.10 | 1045 | 0.05 |
| 3 | artist_name | 0 | 0.00 | 222363 | 9.68 |
| 4 | composer | 1071358 | 46.66 | 329822 | 14.36 |
| 5 | lyricist | 1945306 | 84.71 | 110924 | 4.83 |
| 6 | language | 1 | 0.00 | 10 | 0.00 |
Рассмотрим количество уникальных жанров в таблице.
unique_genres = []
for x in songs_df.genre_ids.dropna():
if isinstance(x, str):
unique_genres.extend(x.split('|'))
else:
unique_genres.append(x)
fig = px.bar(
pd.DataFrame(Counter(unique_genres).items(), columns=['genre_ids', 'count']),
x='genre_ids',
y='count',
text_auto='outside',
title='<i><b>Genres count</b></i>',
)
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(title_x=0.5)
fig.show()
Наблюдается яркий перекос в сторону 10 самых популярных с убывающем трендом по остальным жанрам.
fig = px.histogram(
songs_df.song_length,
x='song_length',
text_auto='outside',
title='<i><b>Songs length distribution</b></i>',
)
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(title_x=0.5)
fig.show()